# import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time
import nltk
# Load the philosophy corpus into a DataFrame and preview the first rows.
# NOTE: the path below is absolute and machine-specific; adjust it when
# running this notebook on another machine.
csv_path = '/Users/wanghan/Documents/ads-spring2023-project1-wangyeye66/data/philosophy_data.csv'
df = pd.read_csv(csv_path)
df.head()
| title | author | school | sentence_spacy | sentence_str | original_publication_date | corpus_edition_date | sentence_length | sentence_lowered | tokenized_txt | lemmatized_str | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Plato - Complete Works | Plato | plato | What's new, Socrates, to make you leave your ... | What's new, Socrates, to make you leave your ... | -350 | 1997 | 125 | what's new, socrates, to make you leave your ... | ['what', 'new', 'socrates', 'to', 'make', 'you... | what be new , Socrates , to make -PRON- lea... |
| 1 | Plato - Complete Works | Plato | plato | Surely you are not prosecuting anyone before t... | Surely you are not prosecuting anyone before t... | -350 | 1997 | 69 | surely you are not prosecuting anyone before t... | ['surely', 'you', 'are', 'not', 'prosecuting',... | surely -PRON- be not prosecute anyone before ... |
| 2 | Plato - Complete Works | Plato | plato | The Athenians do not call this a prosecution b... | The Athenians do not call this a prosecution b... | -350 | 1997 | 74 | the athenians do not call this a prosecution b... | ['the', 'athenians', 'do', 'not', 'call', 'thi... | the Athenians do not call this a prosecution ... |
| 3 | Plato - Complete Works | Plato | plato | What is this you say? | What is this you say? | -350 | 1997 | 21 | what is this you say? | ['what', 'is', 'this', 'you', 'say'] | what be this -PRON- say ? |
| 4 | Plato - Complete Works | Plato | plato | Someone must have indicted you, for you are no... | Someone must have indicted you, for you are no... | -350 | 1997 | 101 | someone must have indicted you, for you are no... | ['someone', 'must', 'have', 'indicted', 'you',... | someone must have indict -PRON- , for -PRON- ... |
To explore the data, we pick the three most important categorical features: title, author, and school. Plotting their distributions clearly shows that Aristotle, Plato, and Hegel are the top three authors in our dataset by number of sentences. That makes sense, since they are so well known. As for schools, the top three in our dataset are analytic, Aristotle, and German idealism.
# Draw one bar chart per categorical feature showing how many rows
# (sentences) fall into each category.
features = ['title', 'author', 'school']
for column in features:
    category_counts = df[column].value_counts()
    plt.figure(figsize=(16, 10), dpi=80)
    category_counts.plot(kind='bar')
    plt.title(column)
    plt.grid()
    plt.show()
Now, let's focus on the numerical data. Sentence length is an important feature in this dataset. Note that it is measured in characters, not words (for example, "What is this you say?" has a length of 21). The median sentence length in this dataset is 127 characters, and the longest sentence is 2649 characters! It seems philosophers love long sentences. It is also interesting that sentence length is approximately normally distributed on a log scale (i.e., roughly log-normal). This result is what I expected.
# Show summary statistics and the distribution of the numeric
# sentence_length column before moving forward.
print(df.sentence_length.describe())

# sns.displot is a figure-level function: it always creates its own figure,
# so calling plt.figure(figsize=...) first only leaves an empty figure
# behind. The 12 x 5 inch size is therefore requested through
# height/aspect (width = height * aspect) instead.
sns.displot(df['sentence_length'], kde=True, height=5, aspect=12 / 5)
plt.title('Sentence Length Distribution')
plt.xlabel('Sentence Length')
plt.ylabel('Counts')
# plt.savefig('sentence_length_dist.png', format='png', dpi=80, bbox_inches='tight')

# Same histogram on a log-scaled x-axis, where the distribution looks
# approximately normal.
sns.displot(df['sentence_length'], log_scale=True, kde=True, bins=50,
            height=5, aspect=12 / 5)
plt.title('Sentence Length Distribution with log-scale')
plt.xlabel('Sentence Length')
plt.ylabel('Counts')
# plt.savefig('sentence_length_dist_log.png', format='png', dpi=80, bbox_inches='tight')
count 360808.000000 mean 150.790964 std 104.822072 min 20.000000 25% 75.000000 50% 127.000000 75% 199.000000 max 2649.000000 Name: sentence_length, dtype: float64
Text(-12.805555555555555, 0.5, 'Counts')
<Figure size 1200x500 with 0 Axes>
<Figure size 1200x500 with 0 Axes>
The corpus edition dates are clustered between 1990 and 2010.
# Histogram (100 bins) with a KDE overlay of the corpus edition years;
# the values cluster around 1990-2010.
edition_years = df['corpus_edition_date']
sns.displot(edition_years, bins=100, kde=True)
<seaborn.axisgrid.FacetGrid at 0x7f834b4f0c10>
To find out which author writes the longest sentences, I averaged the sentence length for each author, ranked the averages from highest to lowest, and displayed them in a horizontal bar plot. The figure clearly shows that Descartes is the most verbose author among all philosophers in our dataset.
# Compare authors by their mean sentence length, longest first.
avg_len = df.groupby('author')['sentence_length'].mean()
avg_len_df = (
    avg_len.rename('length')
    .reset_index()  # the 'author' index becomes a regular column
    .sort_values(by='length', ascending=False)
)

# Horizontal bar plot: one bar per author, ordered as in avg_len_df.
plt.figure(figsize=(15, 10), dpi=80)
ax = sns.barplot(data=avg_len_df, x='length', y='author', palette='Set2')
ax.set_title('Author and their average sentence length', fontsize=15)
ax.set_ylabel('Author Name', fontsize=15)
ax.set_xlabel('Length', fontsize=15)
Text(0.5, 0, 'Length')
Next, I plotted box plots comparing sentence length across the different schools. However, there is no clear relationship between sentence length and school.
# Horizontal box plots of sentence length, one box per school.
plt.figure(figsize=(20, 15))
box_ax = sns.boxplot(data=df, x='sentence_length', y='school', palette='Set2')
box_ax.set_title('Sentence Length by schools', fontsize=20)
box_ax.set_xlabel('Sentence Length', fontsize=15)
box_ax.set_ylabel('Schools', fontsize=15)
Text(0, 0.5, 'Schools')
The following figures are word clouds, one for each school. For example, for the school of Plato, the most frequent words are: one, thing, will, god, soul, ... We can compare the word clouds across the different schools.
from wordcloud import WordCloud, STOPWORDS
schools = df.school.unique()

# Create a word cloud for each school and report the total run time.
# time.perf_counter() is a monotonic, high-resolution clock meant for
# measuring elapsed intervals; time.time() follows the wall clock and can
# jump if the system time is adjusted.
t1 = time.perf_counter()
for school in schools:
    school_df = df[df.school == school]
    print('School = ', school.upper(), ':')
    # Join all lowercased sentences of this school into a single document.
    text = " ".join(school_df.sentence_lowered)
    cloud = WordCloud(
        width=800,
        height=800,
        max_font_size=50,
        max_words=500,
        background_color='white',
        stopwords=STOPWORDS,
    ).generate(text)
    plt.figure(figsize=(15, 15))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()
t2 = time.perf_counter()
print('Elapsed time: ', np.round(t2 - t1, 2))
School = PLATO :
School = ARISTOTLE :
School = EMPIRICISM :
School = RATIONALISM :
School = ANALYTIC :
School = CONTINENTAL :